{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 读取数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train = pd.read_csv('./data/train_set.csv')\n",
    "test = pd.read_csv('./data/test_set.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>article</th>\n",
       "      <th>word_seg</th>\n",
       "      <th>class</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0</td>\n",
       "      <td>7368 1252069 365865 755561 1044285 129532 1053...</td>\n",
       "      <td>816903 597526 520477 1179558 1033823 758724 63...</td>\n",
       "      <td>14</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>581131 165432 7368 957317 1197553 570900 33659...</td>\n",
       "      <td>90540 816903 441039 816903 569138 816903 10343...</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2</td>\n",
       "      <td>7368 87936 40494 490286 856005 641588 145611 1...</td>\n",
       "      <td>816903 1012629 957974 1033823 328210 947200 65...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3</td>\n",
       "      <td>299237 760651 299237 887082 159592 556634 7489...</td>\n",
       "      <td>563568 1239563 680125 780219 782805 1033823 19...</td>\n",
       "      <td>13</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4</td>\n",
       "      <td>7368 7368 7368 865510 7368 396966 995243 37685...</td>\n",
       "      <td>816903 816903 816903 139132 816903 312320 1103...</td>\n",
       "      <td>12</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   id                                            article  \\\n",
       "0   0  7368 1252069 365865 755561 1044285 129532 1053...   \n",
       "1   1  581131 165432 7368 957317 1197553 570900 33659...   \n",
       "2   2  7368 87936 40494 490286 856005 641588 145611 1...   \n",
       "3   3  299237 760651 299237 887082 159592 556634 7489...   \n",
       "4   4  7368 7368 7368 865510 7368 396966 995243 37685...   \n",
       "\n",
       "                                            word_seg  class  \n",
       "0  816903 597526 520477 1179558 1033823 758724 63...     14  \n",
       "1  90540 816903 441039 816903 569138 816903 10343...      3  \n",
       "2  816903 1012629 957974 1033823 328210 947200 65...     12  \n",
       "3  563568 1239563 680125 780219 782805 1033823 19...     13  \n",
       "4  816903 816903 816903 139132 816903 312320 1103...     12  "
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TFIDF构建文本特征"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "word_vec = TfidfVectorizer(analyzer='word',\n",
    "            ngram_range=(1,2),\n",
    "            min_df=3, \n",
    "            max_df=0.9,\n",
    "            use_idf=True,\n",
    "            smooth_idf=True, \n",
    "            sublinear_tf=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_term_doc = word_vec.fit_transform(train['word_seg'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_term_doc = word_vec.transform(test['word_seg'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "102277"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_term_doc.shape[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 构建模型交叉模型"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## label转化"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "lb = LabelEncoder()\n",
    "train['label'] = lb.fit_transform(train['class'].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_term_doc,test_term_doc,train['label']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 定义交叉验证函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import KFold"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "#10折cv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "kf = KFold(n_splits=10, shuffle=True, random_state=666)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "#[102277,19]*5\n",
    "#"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_matrix = np.zeros((train.shape[0],19)) #记录验证集的概率\n",
    "##!!!!!\n",
    "\n",
    "\n",
    "test_pre_matrix = np.zeros((10,test.shape[0],19)) #将5轮的测试概率分别保存起来\n",
    "cv_scores=[] #每一轮线下的验证成绩"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "((10, 102277, 19), (102277, 19))"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_pre_matrix.shape,train_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.metrics import f1_score\n",
    "def cal_macro_f1(y_true,y_pred):\n",
    "    score = f1_score(y_true,y_pred,average='macro')\n",
    "    return score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/hcq/miniconda3/envs/python3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning: Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
      "  FutureWarning)\n",
      "/home/hcq/miniconda3/envs/python3/lib/python3.6/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning: Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
      "  \"this warning.\", FutureWarning)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "validation score is 0.7613543467100502\n",
      "validation score is 0.7768677667455569\n",
      "validation score is 0.7728598168109526\n",
      "validation score is 0.7755113436928625\n",
      "validation score is 0.7760066478306171\n",
      "validation score is 0.7744242858674689\n",
      "validation score is 0.7728841713152601\n",
      "validation score is 0.7713283142117211\n",
      "validation score is 0.7683025318292591\n",
      "validation score is 0.7718469745843946\n"
     ]
    }
   ],
   "source": [
    "from sklearn.linear_model import LogisticRegression\n",
    "\n",
    "\n",
    "for i,(train_index,eval_index) in enumerate(kf.split(train_term_doc)):\n",
    "    print(len(train_index),len(eval_index))\n",
    "    \n",
    "    \n",
    "    #训练集\n",
    "    X_train = train_term_doc[train_index]\n",
    "    y_train = train['label'][train_index]\n",
    "    \n",
    "    #验证集\n",
    "    X_eval = train_term_doc[eval_index]\n",
    "    y_eval = train['label'][eval_index]\n",
    "    \n",
    "    model = LogisticRegression(C=4, dual=True) \n",
    "    model.fit(X_train,y_train)\n",
    "    \n",
    "    ####对于验证集进行预测\n",
    "    eval_prob = model.predict_proba(X_eval)\n",
    "    train_matrix[eval_index] = eval_prob.reshape((X_eval.shape[0], 19))#array\n",
    "    \n",
    "    eval_pred = np.argmax(eval_prob,axis=1)\n",
    "    eval_pred = lb.inverse_transform(eval_pred)\n",
    "    score = cal_macro_f1(lb.inverse_transform(y_eval),eval_pred)\n",
    "    cv_scores.append(score)\n",
    "    print(\"validation score is\",score)\n",
    "    \n",
    "    ###对于测试集进行预测\n",
    "    test_prob = model.predict_proba(test_term_doc)\n",
    "    test_pre_matrix[i,:,:] = test_prob.reshape((test_term_doc.shape[0], 19))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(102277, 19)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_matrix.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "all validation score is 0.7722242367841076\n"
     ]
    }
   ],
   "source": [
    "all_pred = np.argmax(train_matrix,axis=1)\n",
    "all_pred = lb.inverse_transform(all_pred)\n",
    "score = cal_macro_f1(lb.inverse_transform(train['label']),all_pred)\n",
    "print(\"all validation score is\",score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10, 102277, 19)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_pre_matrix.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 提交结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_pred = test_pre_matrix.mean(axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(102277, 19)"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_pred.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "test_pred = np.argmax(test_pred,axis=1)\n",
    "test_pred = lb.inverse_transform(test_pred)\n",
    "test['class'] = test_pred\n",
    "test[[\"id\",\"class\"]].to_csv(\"submission_baseline_cv.csv\",index=False,header=True,encoding='utf-8')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
